Many of the figures and analysis methodology were based off of this tutorial from NYU. https://learn.gencore.bio.nyu.edu/rna-seq-analysis/gene-set-enrichment-analysis/
# Read Count Matrix
count.matrix <- read_tsv(trimws(sample_params$salmon_merged_gene_counts_file_path)) %>%
mutate(across(3:ncol(.), ~ as.integer(.x))) %>%
column_to_rownames("gene_id")
gene.reference <- dplyr::select(count.matrix, 1)
count.matrix <- count.matrix %>%
dplyr::select(-1)
# Design ColData Matrix
sample <- colnames(count.matrix)
colData <- data.frame(sample)
# Add Columns for Condition to colData
for (i in colnames(metadata)){
colData[i] <- metadata[i]
}
colData <- column_to_rownames(colData, "sample")
# Create DESeq Data Set
dds <- DESeqDataSetFromMatrix(
countData = count.matrix,
colData = colData,
design = as.formula(sample_params$design)
)
# Set Reference Level
reflevelcond <- trimws(sample_params$ref_level_cond)
reflevelvalue <- trimws(sample_params$ref_level_value)
dds[[reflevelcond]] <- relevel(dds[[reflevelcond]], ref = reflevelvalue)
# Subset out genes with low overall counts
keep <- rowSums(counts(dds) >= 10) >= 4
dds <- dds[keep,]
# Run DESeq
dds <- DESeq(dds)
# Regularized Log Transform
rld <- rlog(dds)
if (as.logical(trimws(sample_params$batch_correct))) {
# Check that condition for batch correction matches column metadata
if(all(trimws(sample_params$batch_cond) != colnames(metadata))){
stop(sprintf("The given condition for batch correction:%s, doesn't match any prior conditions: %s", sample_params$batch_cond, colnames(metadata)))
}
# Replicate / Batch Correction
batch <- trimws(sample_params$batch_cond)
batch.correct <- ComBat_seq(
counts = as.matrix(count.matrix),
batch = metadata[[batch]],
group = metadata[[colnames(metadata)[colnames(metadata) != batch]]]) %>%
as.data.frame()
# Create DESeq Data Set
batch.correct.dds <- DESeqDataSetFromMatrix(
countData = batch.correct,
colData = colData,
design = as.formula(sample_params$design))
# Set Factor Level
reflevelcond <- trimws(sample_params$ref_level_cond)
reflevelvalue <- trimws(sample_params$ref_level_value)
batch.correct.dds[[reflevelcond]] <- relevel(batch.correct.dds[[reflevelcond]], ref = reflevelvalue)
# Subset out genes with low overall counts
keep <- rowSums(counts(batch.correct.dds) >= 10) >= 4
batch.correct.dds <- batch.correct.dds[keep,]
# Run DESeq
batch.correct.dds <- DESeq(batch.correct.dds)
# Regularized Log Transform
batch.correct.rld <- rlog(batch.correct.dds)
# Cleanup
rm(batch, batch.correct)
}
# Cleanup
rm(keep)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Logic here uses the reference level condition and value inputs to create #
# contrast terms assumed to compare treatment samples with control samples #
# # Find relevant contrasts
contrasts <- list()
# Factor values of experimental condition
pattern <- factor(metadata[[reflevelcond]])
comp <- levels(pattern)
# Remove control state from experimental condition vector
comp <- comp[comp != reflevelvalue]
# Create Contrasts
for (i in 1:length(comp)){
contrasts[[i]] <- c(reflevelcond, comp[i], reflevelvalue)
}
# Pull results from DESeq object with contrasts
for (i in 1:length(contrasts)){
results[[i]] <- list()
results[[i]][["DataFrame"]] <- results(dds.for.analysis, contrast = contrasts[[i]]) %>%
data.frame() %>%
rownames_to_column("gene_id") %>%
mutate(gene_name = gene.reference[gene_id, 1], .before=baseMean) %>% # Add gene names to DESeq results matrices
column_to_rownames("gene_id")
results[[i]][["DESeqDataSet"]] <- results(dds.for.analysis, contrast = contrasts[[i]])
}
# View non-batch corrected
pca <- visualizePCsByCondition(
rlog = rld,
metadata = colData,
title = "Before Replicate Batch Correction")
pca$logScaleScree
pca$linearScaleScree
pca$selectedPCs
# View Rep effect corrected
if(as.logical(trimws(sample_params$batch_correct))){
pca.batch <- visualizePCsByCondition(
rlog = batch.correct.rld,
metadata = colData,
title = "After Replicate Batch Correction")
print(pca.batch$logScaleScree)
print(pca.batch$linearScaleScree)
print(pca.batch$selectedPCs)
}
Inputs for function call:
compileReport(result.obj = results[[1]], contrast = sprintf("%s_%s_vs_%s",
contrasts[[i]][1], contrasts[[i]][2], contrasts[[i]][3]),
l2fc_filter = 0.8, padj_filter = 0.1)
Transfering WORMBASE gene IDs to ENTREZID:
273 gene IDs were not transfered from WORMBASE to ENTREZID
11510 gene IDs were successfully transfered from WORMBASE to ENTREZID
Number of highly variable genes identified (with | L2FC | > 0.8 and padj < 0.1) is:
470
Number of highly upregulated genes identified (with L2FC > 0.8 and padj < 0.1) is:
361
Number of highly downregulated genes identified (with L2FC < -0.8 and padj < 0.1) is:
109
Table Reactome Pathway Over-representation Analysis of Upregulated Genes has no rows.
Table Reactome Pathway Over-representation Analysis of Downregulated Genes has no rows.
Inputs for function call:
compileReport(result.obj = results[[1]], contrast = sprintf("%s_%s_vs_%s",
contrasts[[i]][1], contrasts[[i]][2], contrasts[[i]][3]),
l2fc_filter = 0.8, padj_filter = 0.1)
Transfering WORMBASE gene IDs to ENTREZID:
273 gene IDs were not transfered from WORMBASE to ENTREZID
11510 gene IDs were successfully transfered from WORMBASE to ENTREZID
Number of highly variable genes identified (with | L2FC | > 0.8 and padj < 0.1) is:
470
Number of highly upregulated genes identified (with L2FC > 0.8 and padj < 0.1) is:
361
Number of highly downregulated genes identified (with L2FC < -0.8 and padj < 0.1) is:
109
Table Reactome Pathway Over-representation Analysis of Upregulated Genes has no rows.
Table Reactome Pathway Over-representation Analysis of Downregulated Genes has no rows.